import os
import pandas as pd
from sklearn import metrics
import plotly.express as px
from plotly.offline import init_notebook_mode
from typing import List, Dict, Callable, Iterable, Tuple
# Initialise plotly's offline notebook mode
# (presumably needed so the figures shown later render inline - TODO confirm).
init_notebook_mode(connected=True)
# Bare expression: as the last statement of a notebook cell this displays the
# current working directory.
os.getcwd()
'/Users/cem/Documents/BHT_datascience/master_thesis/master-thesis/src/notebook'
# Set the working directory to the root of the project (two levels above the
# directory the notebook is started in).
# NOTE: the target is derived from the *current* working directory, so
# re-running this cell moves two further levels up each time.
print('Notebook path:', os.getcwd())
os.chdir(f'{os.getcwd()}/../../')
print('New working directory', os.getcwd())
Notebook path: /Users/cem/Documents/BHT_datascience/master_thesis/master-thesis/src/notebook
New working directory /Users/cem/Documents/BHT_datascience/master_thesis/master-thesis
# Module imports (project-local; presumably relies on the working directory
# having been switched to the project root beforehand - TODO confirm).
from src.Configurations import Configurations
# Load all configurations from the YAML file (path is relative to the working directory).
cfg = Configurations('src/configuration.yaml')
# Read the tab-separated metadata file whose path is provided by the configuration object.
df_medic = pd.read_csv(f'{cfg.output_path_medic_all_metadata_updated_with_predictions}', sep='\t')
# Number of non-null entries in the 'image_id' column.
amount_of_sample = df_medic['image_id'].count()
# Bare expression: displays the count as the notebook cell output.
amount_of_sample
71198
# Names of the classification tasks. Each name is used as the label column of
# the dataframe; the matching prediction column is looked up as
# 'prediction <task>' by the metric helpers below.
disaster_types_key = 'disaster_types'
informative_key = 'informative'
humanitarian_key = 'humanitarian'
damage_severity_key = 'damage_severity'
# All tasks; used as the default task list of calculate_task_class_metrics_closure.
all_tasks = [disaster_types_key, informative_key, humanitarian_key, damage_severity_key]
def calculate_task_class_metrics_closure(tasks: List[str] = all_tasks) -> Callable:
    """
    Build the function that computes the class reports of the given tasks
    for one dataframe (e.g. one group of a ``groupby(...).apply(...)``).

    :param tasks: names of the label columns to evaluate; for every task the
        predictions are read from the column ``'prediction <task>'``.
    :return: a function taking a dataframe and returning the concatenated
        class reports of all tasks, rounded to two decimals.
    """
    def _report_for_task(frame: pd.DataFrame, task_name: str) -> pd.DataFrame:
        """
        Extracts the class report as a dataframe for a single task.
        """
        report = metrics.classification_report(
            frame[task_name],
            frame[f'prediction {task_name}'],
            output_dict=True,
            # metrics with a zero denominator are reported as 0
            zero_division=0,
        )
        # the former index (the metric names) becomes the regular column 'metric'
        report_df: pd.DataFrame = pd.DataFrame(report).reset_index()
        return report_df.rename(columns={'index': 'metric'})

    def calculate_task_class_metrics(group_df: pd.DataFrame):
        """
        Use ONLY for the classification variables (tasks)!
        """
        per_task_reports: List[pd.DataFrame] = [
            _report_for_task(group_df, task_name) for task_name in tasks
        ]
        stacked_reports: pd.DataFrame = pd.concat(per_task_reports, axis=0)
        return stacked_reports.round(decimals=2)

    # return the actual function for using with .apply()
    return calculate_task_class_metrics
def get_all_class_reports(medic_df: pd.DataFrame, task:str) -> pd.DataFrame:
    """
    Compute the class report of one task for all samples and per sensitive
    group and return both in long format.

    The report is calculated once per value of the 'split' column (labelled
    'all_samples') and once per value of the 'sensitive group' column (a
    missing group value is labelled 'not locatable').

    :param medic_df: dataframe holding the columns 'split', 'sensitive group',
        the label column ``task`` and the prediction column
        ``'prediction <task>'``.
    :param task: one of the task keys contained in the class map below.
    :return: dataframe with the columns 'metric', 'sensitive group', 'class'
        and 'metric value'.
    :raises KeyError: if ``task`` is not a key of the class map.
    """
    task_columns_map = {
        disaster_types_key: ['earthquake', 'fire', 'flood', 'hurricane', 'landslide', 'not_disaster', 'other_disaster'],
        informative_key: ['informative', 'not_informative'],
        humanitarian_key: ['affected_injured_or_dead_people', 'infrastructure_and_utility_damage', 'not_humanitarian', 'rescue_volunteering_or_donation_effort'],
        damage_severity_key: ['little_or_none', 'mild', 'severe']
    }
    group_column = 'sensitive group'
    calculate_metrics = calculate_task_class_metrics_closure(tasks=[task])

    medic_df_class_report_all_samples = medic_df\
        .groupby(['split'], dropna=False)\
        .apply(calculate_metrics)\
        .droplevel(level=0)
    # Label these rows explicitly instead of filling every NaN of the joined
    # frame afterwards: a frame-wide fillna would also overwrite missing
    # metric values with the label string.
    medic_df_class_report_all_samples[group_column] = 'all_samples'

    medic_df_class_report_by_sensitive_groups = medic_df\
        .groupby([group_column], dropna=False)\
        .apply(calculate_metrics)\
        .reset_index(level=0)
    # Only the group key may be replaced; metric cells that are NaN (a class
    # that does not occur in one of the groups) must stay numeric/NaN.
    medic_df_class_report_by_sensitive_groups[group_column] = \
        medic_df_class_report_by_sensitive_groups[group_column].fillna('not locatable')

    medic_df_class_report_joined = pd.concat([medic_df_class_report_all_samples, medic_df_class_report_by_sensitive_groups])\
        .melt(
            id_vars=['metric', group_column],
            value_vars=task_columns_map[task],
            var_name='class',
            value_name='metric value'
        )
    return medic_df_class_report_joined
def plot_class_scatter_plot(class_report_df: pd.DataFrame, x='support', y='f1-score', color='class', hover_data=['sensitive group'], width=600, height=400, marker_size=12, plot_correlations=True):
    """
    Show a scatter plot of a long-format class report with one point per
    combination of class and sensitive group.

    :param class_report_df: long-format report with the columns 'metric',
        'sensitive group', 'class' and 'metric value'.
    :param x: metric shown on the x axis.
    :param y: metric shown on the y axis.
    :param color: column that determines both colour and symbol of a point.
    :param hover_data: additional columns shown on hover.
    :param width: figure width.
    :param height: figure height.
    :param marker_size: size of the markers.
    :param plot_correlations: currently not used.
    """
    # one row per (class, sensitive group), one column per metric
    wide_report = class_report_df.pivot(index=['class', 'sensitive group'], columns=['metric'])
    wide_report.columns = wide_report.columns.droplevel(level=0)
    wide_report = wide_report.reset_index(level=[0, 1])

    # TODO: `plot_correlations` is unused; the former correlation printout was
    # disabled because it included the 'all_samples' rows in the calculation.

    scatter_options = {
        'x': x,
        'y': y,
        'hover_data': hover_data,
        'color': color,
        'symbol': color,
        'width': width,
        'height': height,
    }
    figure = px.scatter(wide_report, **scatter_options)
    figure.update_layout(font={'size': 8})
    figure.update_traces(marker_size=marker_size)
    figure.show()
def collect_all_support_correlations(task_class_reports: List[Tuple[str, pd.DataFrame]]):
    """
    Collect the correlation between f1-score and support for every task,
    once over all classes and once per class.

    Rows of the sensitive group 'all_samples' are excluded from every
    calculation.

    :param task_class_reports: pairs of a task name and its long-format class
        report (columns 'metric', 'sensitive group', 'class', 'metric value').
    :return: dataframe with one row per context, holding the context label,
        the correlation coefficient and the underlying f1-score and support
        values as lists.
    """
    collected = {
        'task (and class)': [],
        'correlation f1 to support': [],
        'f1-score values (A, B, C, n.l.)': [],
        'support values (A, B, C, n.l.)': [],
    }

    def _add_row(context: str, subset: pd.DataFrame) -> None:
        """
        Appends context, correlation and raw values of one subset.
        """
        correlation_matrix = subset[['f1-score', 'support']].corr()
        collected['task (and class)'].append(context)
        # off-diagonal element: correlation of f1-score with support
        collected['correlation f1 to support'].append(correlation_matrix.iloc[0, 1])
        collected['f1-score values (A, B, C, n.l.)'].append(list(subset['f1-score']))
        collected['support values (A, B, C, n.l.)'].append(list(subset['support']))

    for task_name, class_report_df in task_class_reports:
        # one row per (class, sensitive group), one column per metric
        wide_report = class_report_df.pivot(index=['class', 'sensitive group'], columns=['metric'])
        wide_report.columns = wide_report.columns.droplevel(level=0)
        wide_report = wide_report.reset_index(level=[0, 1])

        groups_only = wide_report[wide_report['sensitive group'] != 'all_samples']
        _add_row(f'{task_name} - all classes', groups_only)

        for category in wide_report['class'].unique():
            _add_row(f'{task_name} - {category}', groups_only[groups_only['class'] == category])

    return pd.DataFrame(collected)
# One dataframe per value of the 'split' column.
df_medic_train, df_medic_dev, df_medic_test = (
    df_medic[df_medic['split'] == split_name]
    for split_name in ('train', 'dev', 'test')
)
print(
    'amount of samples in train/dev/test:',
    *map(len, (df_medic_train, df_medic_dev, df_medic_test))
)
amount of samples in train/dev/test: 49353 6157 15688
# Long-format class reports of the test split, one per task.
df_medic_test_disaster_types = get_all_class_reports(df_medic_test, task=disaster_types_key)
df_medic_test_informative = get_all_class_reports(df_medic_test, task=informative_key)
df_medic_test_humanitarian = get_all_class_reports(df_medic_test, task=humanitarian_key)
df_medic_test_damage_severity = get_all_class_reports(df_medic_test, task=damage_severity_key)
# Correlation of f1-score and support per task and per class; as the last
# expression of the notebook cell the resulting dataframe is displayed.
# The first tuple element is only used as a label in the result.
collect_all_support_correlations(
[
('disaster_type', df_medic_test_disaster_types),
('informative', df_medic_test_informative),
('humanitarian', df_medic_test_humanitarian),
('damage_severity', df_medic_test_damage_severity)
]
)
| | task (and class) | correlation f1 to support | f1-score values (A, B, C, n.l.) | support values (A, B, C, n.l.) |
|---|---|---|---|---|
| 0 | disaster_type - all classes | 0.584971 | [0.35, 0.8, 0.86, 0.57, 0.81, 0.46, 0.39, 0.78... | [94.0, 349.0, 929.0, 423.0, 266.0, 7.0, 29.0, ... |
| 1 | disaster_type - earthquake | 0.794854 | [0.35, 0.8, 0.86, 0.57] | [94.0, 349.0, 929.0, 423.0] |
| 2 | disaster_type - fire | 0.930358 | [0.81, 0.46, 0.39, 0.78] | [266.0, 7.0, 29.0, 388.0] |
| 3 | disaster_type - flood | 0.890944 | [0.78, 0.83, 0.51, 0.78] | [480.0, 339.0, 41.0, 455.0] |
| 4 | disaster_type - hurricane | 0.768808 | [0.66, 0.51, 0.27, 0.55] | [1066.0, 105.0, 53.0, 294.0] |
| 5 | disaster_type - landslide | 0.957287 | [0.2, 0.33, 0.17, 0.74] | [14.0, 4.0, 11.0, 302.0] |
| 6 | disaster_type - not_disaster | 0.110990 | [0.91, 0.9, 0.9, 0.87] | [3569.0, 1004.0, 1849.0, 2463.0] |
| 7 | disaster_type - other_disaster | 0.963903 | [0.13, 0.07, 0.21, 0.33] | [191.0, 37.0, 238.0, 688.0] |
| 8 | informative - all classes | -0.160969 | [0.82, 0.88, 0.86, 0.87, 0.87, 0.87, 0.85, 0.86] | [2234.0, 914.0, 1505.0, 2553.0, 3446.0, 931.0,... |
| 9 | informative - informative | -0.473840 | [0.82, 0.88, 0.86, 0.87] | [2234.0, 914.0, 1505.0, 2553.0] |
| 10 | informative - not_informative | 0.196733 | [0.87, 0.87, 0.85, 0.86] | [3446.0, 931.0, 1645.0, 2460.0] |
| 11 | humanitarian - all classes | 0.757010 | [0.2, 0.25, 0.59, 0.14, 0.82, 0.81, 0.86, 0.8,... | [94.0, 89.0, 361.0, 95.0, 1633.0, 618.0, 976.0... |
| 12 | humanitarian - affected_injured_or_dead_people | 0.970673 | [0.2, 0.25, 0.59, 0.14] | [94.0, 89.0, 361.0, 95.0] |
| 13 | humanitarian - infrastructure_and_utility_damage | -0.409335 | [0.82, 0.81, 0.86, 0.8] | [1633.0, 618.0, 976.0, 1997.0] |
| 14 | humanitarian - not_humanitarian | 0.287248 | [0.91, 0.89, 0.87, 0.86] | [3658.0, 967.0, 1675.0, 2845.0] |
| 15 | humanitarian - rescue_volunteering_or_donation... | 0.703318 | [0.38, 0.45, 0.21, 0.14] | [295.0, 171.0, 138.0, 76.0] |
| 16 | damage_severity - all classes | 0.668863 | [0.93, 0.9, 0.92, 0.87, 0.2, 0.22, 0.13, 0.16,... | [4080.0, 1115.0, 2008.0, 3049.0, 542.0, 226.0,... |
| 17 | damage_severity - little_or_none | 0.194649 | [0.93, 0.9, 0.92, 0.87] | [4080.0, 1115.0, 2008.0, 3049.0] |
| 18 | damage_severity - mild | 0.087564 | [0.2, 0.22, 0.13, 0.16] | [542.0, 226.0, 219.0, 540.0] |
| 19 | damage_severity - severe | -0.457275 | [0.7, 0.76, 0.83, 0.71] | [1058.0, 504.0, 923.0, 1424.0] |
# Scatter plots (support vs. f1-score by default) of the test split, one figure per task.
plot_class_scatter_plot(df_medic_test_disaster_types, height=400, width=600)
plot_class_scatter_plot(df_medic_test_informative, height=400, width=600)
plot_class_scatter_plot(df_medic_test_humanitarian, height=400, width=600)
plot_class_scatter_plot(df_medic_test_damage_severity, height=400, width=600)